In [1]:
import os
import sys
import joblib
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

sys.path.append('../')
from functionality import funs

from sklearnex import patch_sklearn
patch_sklearn()

from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import ExtraTreeClassifier, DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier,  GradientBoostingClassifier

from sklearn.gaussian_process.kernels import RBF

from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer, LabelEncoder
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import recall_score, precision_score, accuracy_score, roc_auc_score, make_scorer, classification_report

# Set the random generators for reproducibility.
# NOTE(review): assigning PYTHONHASHSEED from inside a running interpreter does
# NOT affect this process's hash randomisation — it only applies to
# subprocesses. The estimators below are seeded via `random_state` instead.
os.environ['PYTHONHASHSEED']= str(2124)

# Set a custom color palette:
colors = ['red','darksalmon','olive','darkseagreen','dodgerblue','navy']

# Extended palette used by the plotting helpers throughout the notebook.
color = ['maroon','red','tomato','darksalmon','firebrick',
         'darkseagreen','seagreen','lightseagreen','olive','green',
         'dodgerblue','deepskyblue','navy','blue','royalblue']

my_palette = sns.color_palette(color)
sns.set_palette(my_palette)

# Current working directory and its parent (used to build paths later).
working = os.getcwd()
dirname = os.path.dirname(working)
Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)

Higher Education Students Performance Evaluation¶

  • The data was collected from Turkish students at two faculties: Faculty of Engineering and Faculty of Educational Sciences students in 2019.

  • The goal is to create an ML model that can predict student performance given the data taken from a survey.

  • The grades are categorical –AA, BA, BB, CB, CC, DC, DD, and Fail– hence the task should be modeled as a multi-class classification problem.


Data Set Information

The data contains results from a survey in which columns 1-10 relate to personal questions, columns 11-16 are family related, and the remaining questions cover education habits.

In [2]:
# Load data, reading the course identifier as a string rather than a number.
data = pd.read_csv('../data/data.csv', dtype={'Course ID':object})

1. Exploratory Data Analysis¶

In [3]:
# Category orders used for plotting: either taken (sorted) from the data, or
# spelled out explicitly where the natural order is not alphabetical.
# Fix: `courses` was assigned twice with the same expression — the duplicate
# assignment has been removed.
ages = data.Age.sort_values().unique()
grades = data.Grade.sort_values().unique()
hours = ['none','<5 hours','6-10 hours','11-20 hours','more than 20 hours']
scholarship = data.Scholarship.sort_values().unique()
notes = ['always', 'sometimes', 'never']
listening = ['always', 'sometimes', 'never']
attendance = ['always', 'sometimes']
exams1 = data['Preparation to Midterm Exams 1'].sort_values().unique()
exams2 = data['Preparation to Midterm Exams 2'].sort_values().unique()
fathers = ['Ph.D.','MSc.','university','high school','primary school','secondary school']
mothers = ['Ph.D.','MSc.','university','high school','primary school','secondary school']
courses = data['Course ID'].sort_values().unique()
transport = data['Transportation to University'].unique()
In [4]:
# Count students per grade for the pie chart.
props = (data.groupby('Grade')['Grade']
         .count().to_frame('Count')
         .reset_index())

# A 1x1 'domain' subplot is required to host a pie trace.
fig = make_subplots(
    rows=1, 
    cols=1,
    specs=[[{'type':'domain'}]])

fig.add_trace(
    go.Pie(
        labels=props['Grade'], 
        values=props['Count'],
        # One colour per grade. Fix: the slice previously took len(grades)+1
        # colours, passing one extra, unused colour.
        marker={'colors':color[:len(grades)]},
        sort=False),
    1, 1)

fig.update_traces(textposition='inside', textinfo='percent+label')

# Update layout settings for the figure.
fig.update_layout(
    title={'text':'Grades % Distribution','font_size':20},
    showlegend=False, 
    height=650,
    width=1650,
    template='plotly_white')

The outcome data –the grades– shows an imbalanced distribution. Whilst DD has 25% of the data, BA and CB have less than 10% and Fail represents only 5.5% of the whole data –only eight points–. This eventually will present a problem as the model will have few data points to train on predicting the Fail grade, but more data points to train the model on predicting the DD grade.

In [5]:
# Stacked bar chart: number of students per grade within each course.
fig = px.bar((data
                .pivot_table(index='Grade', columns='Course ID', values='Student ID', fill_value=0, aggfunc='count')
                .unstack()
                .to_frame('Count')
                .reset_index()), 
             x='Course ID', 
             y='Count',
             color='Grade',
             category_orders={'Grade': grades},
             labels={'Count':'# Students'},
             # Map each grade to the reversed colour list.
             color_discrete_map=dict(zip(grades, np.flip(color)[0:len(grades)])),
             )

fig.update_layout(
    showlegend=True, 
    height=500, 
    width=1650, 
    template='plotly_white',
    title='Grades by Course ID',
    yaxis_range = [0,70])

fig.show()
In [6]:
# Age distribution against grade, split by sex.
funs.eda_plotter(data, 'Age', ages, facet_col='Sex')
In [7]:
# Weekly study hours against grade.
funs.eda_plotter(data, 'Weekly Study Hours', hours)
In [8]:
# Scholarship level against grade.
funs.eda_plotter(data, 'Scholarship', scholarship)
In [9]:
# Attendance, listening and note-taking habits against grade.
funs.eda_plotter(data, ['Attendance to Classes','Listening in Classes','Taking Notes in Classes'], [attendance,listening,notes])
In [10]:
# Midterm preparation habits against grade.
funs.eda_plotter(data, ['Preparation to Midterm Exams 1','Preparation to Midterm Exams 2'], [exams1, exams2])
In [11]:
# Fathers' education level against grade.
funs.eda_plotter(data, 'Fathers Education', fathers)
In [12]:
# Mothers' education level against grade. Fix: use the `mothers` ordering
# (the cell previously passed `fathers`; the lists are identical, so the
# output is unchanged, but the intent is now explicit).
funs.eda_plotter(data, 'Mothers Education', mothers)

2. Data Preparation¶

In [13]:
# Drop rows whose category value appears only once in the data, applying the
# four single-appearance filters in one boolean mask (same rows as filtering
# sequentially).
mask = (
    (data['Scholarship'] != 'None')
    & (data['Transportation to University'] != 'bicycle')
    & (data['Accommodation Type'] != 'other')
    & (data['Fathers Education'] != 'Ph.D.')
)
data = data.loc[mask]
In [14]:
# Create the X matrix and y outcome, dropping the identifier column from X.
y = data['Grade']
X = data.drop(['Student ID','Grade'], axis=1)
In [15]:
# Get the sorted, unique classes from the outcome. 
classes = y.sort_values().unique()

Error, Data Transformation, K-Fold and Metrics

In [16]:
# Create a transformer to one-hot encode all categorical (object-dtype)
# columns, passing the remaining columns through untouched.
Transformer = make_column_transformer(
    (OneHotEncoder(sparse_output=False), make_column_selector(dtype_include=object)), 
    remainder="passthrough")

# Create a stratified shuffled split: 3 folds, 10% held out per fold.
sss = StratifiedShuffleSplit(3, test_size=0.1, random_state=6064)

# Set the list of metrics to assess the models' performances.
# NOTE(review): `needs_proba` is deprecated in newer scikit-learn releases in
# favour of `response_method='predict_proba'` — confirm the installed version.
metrics = {'accuracy':make_scorer(accuracy_score, greater_is_better=True),
           'precision_macro':make_scorer(precision_score, greater_is_better=True, average='macro', zero_division=0),
           'recall_macro':make_scorer(recall_score, greater_is_better=True, average='macro', zero_division=0),
           'auc': make_scorer(roc_auc_score, greater_is_better=True, average='macro', needs_proba=True, multi_class='ovr', labels=classes)}

Precision and recall provide insights into the model's performance for each class individually, while accuracy gives an overall view of the model's correctness. Since this is a multi-class classification problem, precision and recall are calculated individually for each class and then averaged.

Precision: measures the proportion of correctly predicted grades out of all grades predicted as a specific grade. In this case, when predicting an AA grade, what proportion of all predicted AA grades were truly AA grades. The procedure is repeated for each individual grade. High precision indicates that the model is good at correctly identifying a specific grade without confusing it with the other grades. However, it doesn't consider the case when a grade was not predicted as the real grade.

Recall: measures the proportion of correctly predicted grades out of all actual grades in the set. In this case, when predicting an AA grade, what proportion of all AA grades were predicted as AA grades. The procedure is repeated for each individual grade. High recall indicates that the model is good at assigning most of the grades from each category to its real category.

Accuracy: measures the overall correctness of the model's predictions across all grades. It calculates the proportion of correctly predicted grades out of the total number of grades. It provides an overall assessment of the model's performance, considering both correct predictions for identifying the real and false grade category. However, it may not be the most informative metric when dealing with imbalanced datasets, where the number of instances in each class varies significantly.

In [17]:
# Create a label binarizer fitted using y (used later for the ROC plots).
binarizer = LabelBinarizer().fit(y)

# Create a label encoder fitted using y.
encoder = LabelEncoder().fit(y)

Train and Test Subsets

Since the data is imbalanced, the imbalance has to be taken into account when splitting into the train and test sets. The even split is needed so that the model can train using all possible outcomes – with a distribution comparable to that expected in unseen data.

In [18]:
# Create a train and test set for X and y. Set test size to 20% of the data.
# Create a train and test set for X and y. Set test size to 20% of the data,
# stratifying on the grade so both splits keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, shuffle=True, random_state=1234, stratify=data[['Grade']])
In [19]:
# Plot the grade distribution of both splits; the returned `weights` are used
# later as `class_weight` candidates in the tree-based grid searches.
weights = funs.grades_distribution([y_train, y_test], ['Train Set', 'Test Set'])

3. Classification Study¶

The following list shows the estimators –and their parameters– that are studied to identify the best possible model:

log = LogisticRegression(penalty=None, random_state=6064, solver='saga', max_iter=7500, multi_class='multinomial', n_jobs=-1)
l1 = LogisticRegression(penalty='l1', random_state=6064, solver='saga', max_iter=7500, multi_class='multinomial', n_jobs=-1)
l2 = LogisticRegression(penalty='l2', random_state=6064, solver='sag', max_iter=10500, multi_class='multinomial', n_jobs=-1)
net = LogisticRegression(penalty='elasticnet', random_state=6064, solver='saga', max_iter=10500, multi_class='multinomial', n_jobs=-1, l1_ratio=0.5)
sgd = SGDClassifier(loss='modified_huber', penalty=None, max_iter=7500, n_jobs=-1, random_state=6064)
mlp = MLPClassifier(solver='adam', max_iter=4500, random_state=6064)
dtc = DecisionTreeClassifier(random_state=6064)
rfc = RandomForestClassifier(random_state=6064, n_jobs=1)
etc = ExtraTreeClassifier(random_state=6064)
ets = ExtraTreesClassifier(random_state=6064, n_jobs=1)
abc = AdaBoostClassifier(random_state=6064)
gpc = GaussianProcessClassifier(kernel=RBF(0.05), random_state=6064, n_jobs=1)
gbc = GradientBoostingClassifier(loss='log_loss', random_state=6064)
svc = SVC(kernel=RBF(), probability=True)
In [20]:
# Candidate estimators, each seeded for reproducibility.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases —
# confirm the installed version before re-running.
log = LogisticRegression(penalty=None, random_state=6064, solver='saga', max_iter=7500, multi_class='multinomial', n_jobs=-1)
l1 = LogisticRegression(penalty='l1', random_state=6064, solver='saga', max_iter=7500, multi_class='multinomial', n_jobs=-1)
l2 = LogisticRegression(penalty='l2', random_state=6064, solver='sag', max_iter=10500, multi_class='multinomial', n_jobs=-1)
net = LogisticRegression(penalty='elasticnet', random_state=6064, solver='saga', max_iter=10500, multi_class='multinomial', n_jobs=-1, l1_ratio=0.5)
sgd = SGDClassifier(loss='modified_huber', penalty=None, max_iter=7500, n_jobs=-1, random_state=6064)
mlp = MLPClassifier(solver='adam', max_iter=4500, random_state=6064)
dtc = DecisionTreeClassifier(random_state=6064)
rfc = RandomForestClassifier(random_state=6064, n_jobs=1)
etc = ExtraTreeClassifier(random_state=6064)
ets = ExtraTreesClassifier(random_state=6064, n_jobs=1)
abc = AdaBoostClassifier(random_state=6064)
gpc = GaussianProcessClassifier(kernel=RBF(0.05), random_state=6064, n_jobs=1)
gbc = GradientBoostingClassifier(loss='log_loss', random_state=6064)
svc = SVC(kernel=RBF(), probability=True)
In [21]:
# Accumulates the validation metrics of each procedure for later comparison.
validation = []
In [22]:
# All candidate estimators evaluated in the following sections.
estimators = [log, l1, l2, net, sgd, mlp, dtc, rfc, etc, abc, ets, gpc, gbc, svc]

3.1 Standard Estimators¶

In [23]:
# Cross-validate every standard estimator on the training folds.
train, validate = funs.cv_models_performance(estimators, Transformer, X_train, y_train, metrics, sss)
In [24]:
# Plot train vs. validation performance for each estimator.
funs.performance_plotter(train, validate, 'Validation', color)
In [25]:
# Store this procedure's validation results for the final comparison.
validation.append(validate)
In [26]:
# Display the validation metrics without the row index.
validate.style.hide(axis='index')
Out[26]:
Model Accuracy Recall weighted Precision weighted AUC
logisticregression 0.250000 0.229200 0.188900 0.625700
logisticregression_l1 0.333300 0.291700 0.223600 0.714700
logisticregression_l2 0.250000 0.194400 0.156500 0.644300
logisticregression_elasticnet 0.305600 0.263900 0.210800 0.683800
sgd 0.277800 0.250000 0.193500 0.551800
mlp 0.361100 0.312500 0.241300 0.635900
decisiontree 0.166700 0.131900 0.103500 0.503500
randomforest 0.416700 0.347200 0.291700 0.695700
extratree 0.138900 0.152800 0.121500 0.512600
adaboost 0.250000 0.159700 0.084600 0.607700
extratrees 0.416700 0.375000 0.343100 0.723000
gaussianprocess 0.083300 0.125000 0.010400 0.500000
gradientboosting 0.388900 0.395800 0.309000 0.661400
svc 0.250000 0.125000 0.031200 0.326000

3.2 Standard Estimators & Feature Selection with Variance Threshold of 0.10¶

In [27]:
# Repeat the cross-validation after dropping features with variance below 0.10.
train, validate = funs.cv_models_performance(estimators, Transformer, X_train, y_train, metrics, sss, variance_threshold=0.10)
In [28]:
# Plot train vs. validation performance for each estimator.
funs.performance_plotter(train, validate, 'Validation', color)
In [29]:
# Store this procedure's validation results for the final comparison.
validation.append(validate)
In [30]:
# Display the validation metrics without the row index.
validate.style.hide(axis='index')
Out[30]:
Model Accuracy Recall weighted Precision weighted AUC
logisticregression 0.277800 0.243100 0.214600 0.632400
logisticregression_l1 0.333300 0.284700 0.229900 0.716900
logisticregression_l2 0.277800 0.222200 0.199300 0.637900
logisticregression_elasticnet 0.277800 0.201400 0.165300 0.673600
sgd 0.250000 0.180600 0.194400 0.572400
mlp 0.250000 0.250000 0.156200 0.600300
decisiontree 0.250000 0.284700 0.211800 0.586200
randomforest 0.250000 0.222200 0.160100 0.660200
extratree 0.194400 0.138900 0.116000 0.510100
adaboost 0.194400 0.201400 0.168500 0.605000
extratrees 0.250000 0.208300 0.194400 0.706400
gaussianprocess 0.083300 0.125000 0.010400 0.500000
gradientboosting 0.305600 0.291700 0.215300 0.661900
svc 0.250000 0.125000 0.031200 0.333800

3.3 Standard Estimators & Feature Selection with Variance Threshold of 0.20¶

In [31]:
# Repeat the cross-validation after dropping features with variance below 0.20.
train, validate = funs.cv_models_performance(estimators, Transformer, X_train, y_train, metrics, sss, variance_threshold=0.2)
In [32]:
# Plot train vs. validation performance for each estimator.
funs.performance_plotter(train, validate, 'Validation', color)
In [33]:
# Store this procedure's validation results for the final comparison.
validation.append(validate)
In [34]:
# Display the validation metrics without the row index.
validate.style.hide(axis='index')
Out[34]:
Model Accuracy Recall weighted Precision weighted AUC
logisticregression 0.416700 0.354200 0.354200 0.700400
logisticregression_l1 0.305600 0.250000 0.219200 0.677200
logisticregression_l2 0.277800 0.277800 0.213200 0.675700
logisticregression_elasticnet 0.250000 0.215300 0.173600 0.703400
sgd 0.222200 0.194400 0.135400 0.582000
mlp 0.333300 0.298600 0.183300 0.651500
decisiontree 0.250000 0.180600 0.152800 0.534000
randomforest 0.277800 0.250000 0.163200 0.674900
extratree 0.305600 0.284700 0.181900 0.591800
adaboost 0.222200 0.194400 0.140300 0.737700
extratrees 0.305600 0.243100 0.181900 0.741300
gaussianprocess 0.083300 0.125000 0.010400 0.500000
gradientboosting 0.250000 0.236100 0.148600 0.649300
svc 0.250000 0.125000 0.031200 0.320800

3.4. Grid Search for Best Estimators¶

In [35]:
# Create a pipeline with data transformation and variance threshold; the
# threshold value itself is tuned inside the grid searches below.
preprocessor = make_pipeline(Transformer, VarianceThreshold())
In [36]:
# Skip the expensive search when a tuned model has already been saved.
# (Idiom fix: `x not in y` instead of `not x in y`.)
if 'logisticregression.joblib' not in os.listdir('../working/best_estimators'):

    # Tune the variance threshold and multi-class strategy, refitting on accuracy.
    log_cv = GridSearchCV(
        estimator =  make_pipeline(preprocessor, log),
        param_grid={
            'pipeline__variancethreshold__threshold':np.arange(0.05, 0.25,0.025),
            'logisticregression__multi_class':['multinomial','ovr']},
        scoring=metrics,
        n_jobs=-1,
        refit='accuracy', 
        cv=sss,
        return_train_score=True)

    _ = log_cv.fit(X_train, y_train)
In [37]:
# Skip the search when a tuned model has already been saved.
if 'logisticregression_l1.joblib' not in os.listdir('../working/best_estimators'):

    # Tune threshold, C and multi-class strategy for the l1 logistic regression.
    l1_cv = GridSearchCV(
        estimator = make_pipeline(preprocessor, l1),
        param_grid={
            'pipeline__variancethreshold__threshold':np.arange(0.05, 0.25,0.025),
            'logisticregression__C':[0.1,0.5,1,5,10,50,100],
            'logisticregression__multi_class':['multinomial','ovr']},
        scoring=metrics,
        n_jobs=-1,
        refit='accuracy', 
        cv=sss,
        return_train_score=True)

    _ = l1_cv.fit(X_train, y_train)
In [38]:
# Skip the search when a tuned model has already been saved.
if 'logisticregression_l2.joblib' not in os.listdir('../working/best_estimators'):

    # Tune threshold, C and multi-class strategy for the l2 logistic regression.
    l2_cv = GridSearchCV(
        estimator = make_pipeline(preprocessor, l2),
        param_grid={
            'pipeline__variancethreshold__threshold':np.arange(0.05, 0.25,0.025),
            'logisticregression__C':[0.1,0.5,1,5,10,50,100],
            'logisticregression__multi_class':['multinomial','ovr']},
        scoring=metrics,
        n_jobs=-1,
        refit='accuracy', 
        cv=sss,
        return_train_score=True)

    _ = l2_cv.fit(X_train, y_train)
In [39]:
# Skip the search when a tuned model has already been saved.
if 'logisticregression_elasticnet.joblib' not in os.listdir('../working/best_estimators'):

    # Tune threshold, C, l1 ratio and multi-class strategy for elastic net.
    net_cv = GridSearchCV(
        estimator = make_pipeline(preprocessor, net),
        param_grid={
            'pipeline__variancethreshold__threshold':np.arange(0.05, 0.25,0.025),
            'logisticregression__C':[0.1,0.5,1,5,10,50,100],
            'logisticregression__l1_ratio':np.arange(0.1,1.1,0.1),
            'logisticregression__multi_class':['multinomial','ovr']},
        scoring=metrics,
        n_jobs=-1,
        refit='accuracy', 
        cv=sss,
        return_train_score=True)

    _ = net_cv.fit(X_train, y_train)
In [40]:
# Skip the search when a tuned model has already been saved.
if 'sgd_l1.joblib' not in os.listdir('../working/best_estimators'):

    # Tune threshold, loss, penalty and alpha for the SGD classifier.
    sgd_cv = GridSearchCV(
        estimator =  make_pipeline(preprocessor, sgd),
        param_grid={
            'pipeline__variancethreshold__threshold':np.arange(0.05, 0.25,0.025),
            'sgdclassifier__loss':['log_loss','modified_huber'],
            'sgdclassifier__penalty':['l2', 'l1', 'elasticnet', None],
            'sgdclassifier__alpha':np.arange(0.0001,0.11,0.025)},
        scoring=metrics,
        n_jobs=-1,
        refit='accuracy', 
        cv=sss,
        return_train_score=True)

    _ = sgd_cv.fit(X_train, y_train)
In [41]:
# Skip the search when a tuned model has already been saved.
if 'mlp.joblib' not in os.listdir('../working/best_estimators'):

    # Tune threshold, layer sizes, activation and alpha for the MLP.
    mlp_cv = GridSearchCV(
        estimator = make_pipeline(preprocessor, mlp),
        param_grid={
            'pipeline__variancethreshold__threshold':np.arange(0.05, 0.25,0.025),
            'mlpclassifier__hidden_layer_sizes':[(50,), (100,), (150,), (200,)],
            'mlpclassifier__activation':['identity', 'logistic', 'tanh', 'relu'],
            'mlpclassifier__alpha':[0.1, 0.05, 0.01, 0.001]},
        scoring=metrics,
        n_jobs=-1,
        refit='accuracy', 
        cv=sss,
        return_train_score=True)

    _ = mlp_cv.fit(X_train, y_train)
In [42]:
# Skip the search when a tuned model has already been saved.
if 'decisiontree.joblib' not in os.listdir('../working/best_estimators'):

    # Tune depth, split/leaf sizes, pruning and class weights for the tree.
    dtc_cv = GridSearchCV(
        estimator = make_pipeline(preprocessor, dtc),
        param_grid={
            'pipeline__variancethreshold__threshold':np.arange(0.1, 0.225,0.025),
            'decisiontreeclassifier__criterion':['gini','entropy','log_loss'],
            'decisiontreeclassifier__max_depth':np.arange(5, 11),
            'decisiontreeclassifier__min_samples_split':np.arange(2, 5),
            'decisiontreeclassifier__min_samples_leaf':np.arange(1, 5),
            'decisiontreeclassifier__class_weight':[None, weights],
            'decisiontreeclassifier__ccp_alpha':np.arange(0.005, 0.035, 0.005)},
        scoring=metrics,
        n_jobs=-1,
        refit='accuracy', 
        cv=sss,
        return_train_score=True)

    _ = dtc_cv.fit(X_train, y_train)
In [43]:
# Skip the search when a tuned model has already been saved.
if 'randomforest.joblib' not in os.listdir('../working/best_estimators'):

    # Tune depth, split/leaf sizes, pruning and class weights for the forest.
    rfc_cv = GridSearchCV(
        estimator = make_pipeline(preprocessor, rfc),
        param_grid={
            'pipeline__variancethreshold__threshold':np.arange(0.1, 0.225,0.025),
            'randomforestclassifier__criterion':['gini','entropy','log_loss'],
            'randomforestclassifier__max_depth':np.arange(5, 11),
            'randomforestclassifier__min_samples_split':np.arange(2, 5),
            'randomforestclassifier__min_samples_leaf':np.arange(1, 5),
            'randomforestclassifier__class_weight':[None, weights],
            'randomforestclassifier__ccp_alpha':np.arange(0.005, 0.035, 0.005)},
        scoring=metrics,
        n_jobs=-1,
        refit='accuracy', 
        cv=sss,
        return_train_score=True)

    _ = rfc_cv.fit(X_train, y_train)
In [44]:
# Skip the search when a tuned model has already been saved.
if 'extratree.joblib' not in os.listdir('../working/best_estimators'):

    # Tune depth, split/leaf sizes, pruning and class weights for the extra tree.
    # NOTE(review): `return_train_score=False` differs from the other searches —
    # confirm whether the inconsistency is intentional.
    etc_cv = GridSearchCV(
        estimator =  make_pipeline(preprocessor, etc),
        param_grid={
            'pipeline__variancethreshold__threshold':np.arange(0.1, 0.225,0.025),
            'extratreeclassifier__criterion':['gini','entropy','log_loss'],
            'extratreeclassifier__max_depth':np.arange(5, 11),
            'extratreeclassifier__min_samples_split':np.arange(2, 5),
            'extratreeclassifier__min_samples_leaf':np.arange(1, 5),
            'extratreeclassifier__class_weight':[None, weights],
            'extratreeclassifier__ccp_alpha':np.arange(0.005, 0.035, 0.005)},
        scoring=metrics,
        n_jobs=-1,
        refit='accuracy', 
        cv=sss,
        return_train_score=False)

    _ = etc_cv.fit(X_train, y_train)
In [45]:
# Skip the search when a tuned model has already been saved.
if 'extratrees.joblib' not in os.listdir('../working/best_estimators'):

    # Tune ensemble size, depth, split/leaf sizes, pruning and class weights.
    ets_cv = GridSearchCV(
        estimator =  make_pipeline(preprocessor, ets),
        param_grid={
            'pipeline__variancethreshold__threshold':np.arange(0.1, 0.225,0.025),
            'extratreesclassifier__criterion':['gini','entropy','log_loss'],
            'extratreesclassifier__n_estimators':np.arange(5, 11),
            'extratreesclassifier__max_depth':np.arange(5, 11),
            'extratreesclassifier__min_samples_split':np.arange(2, 5),
            'extratreesclassifier__min_samples_leaf':np.arange(1, 5),
            'extratreesclassifier__class_weight':[None, weights],
            'extratreesclassifier__ccp_alpha':np.arange(0.005, 0.035, 0.005)},
        scoring=metrics,
        n_jobs=-1,
        refit='accuracy', 
        cv=sss,
        return_train_score=False)

    _ = ets_cv.fit(X_train, y_train)
In [46]:
# Skip the search when a tuned model has already been saved.
# NOTE(review): this cell reads dtc_cv/rfc_cv/etc_cv/ets_cv, which only exist
# when the corresponding searches above actually ran — it raises a NameError
# if their .joblib files were already present. Confirm the intended workflow.
if 'adaboost.joblib' not in os.listdir('../working/best_estimators'):

    # Tune AdaBoost over the best tree-based base estimators found above.
    abc_cv = GridSearchCV(
        estimator = make_pipeline(preprocessor, abc),
        param_grid={
            'pipeline__variancethreshold__threshold':np.arange(0.1, 0.225,0.025),
            'adaboostclassifier__estimator':[dtc_cv.best_estimator_.steps[1][1], 
                                             rfc_cv.best_estimator_.steps[1][1], 
                                             etc_cv.best_estimator_.steps[1][1], 
                                             ets_cv.best_estimator_.steps[1][1]],
            'adaboostclassifier__n_estimators':np.arange(10, 110, 10),
            'adaboostclassifier__learning_rate':np.arange(0.1,1.1,0.1)},
        scoring=metrics,
        n_jobs=-1,
        refit='accuracy', 
        cv=sss,
        return_train_score=True)

    _ = abc_cv.fit(X_train, y_train)
In [47]:
# Skip the search when a tuned model has already been saved.
if 'gaussianprocess.joblib' not in os.listdir('../working/best_estimators'):

    # Tune the variance threshold and the RBF kernel length scale.
    gpc_cv = GridSearchCV(
        estimator =  make_pipeline(preprocessor, gpc),
        param_grid={
            'pipeline__variancethreshold__threshold':np.arange(0.05, 0.25,0.025),
            'gaussianprocessclassifier__kernel':[RBF(0.001), RBF(0.005), RBF(0.01), RBF(0.05)]},
        scoring=metrics,
        n_jobs=-1,
        refit='accuracy', 
        cv=sss,
        return_train_score=False)

    _ = gpc_cv.fit(X_train, y_train)
In [48]:
# Skip the search when a tuned model has already been saved.
if 'gradientboosting.joblib' not in os.listdir('../working/best_estimators'):

    # Randomized (30-draw) search over the gradient boosting hyperparameters.
    gbc_cv = RandomizedSearchCV(    
        estimator = make_pipeline(preprocessor, gbc),
        param_distributions={
            'pipeline__variancethreshold__threshold':np.arange(0.05, 0.25, 0.025),
            'gradientboostingclassifier__learning_rate':[0.01, 0.05, 0.1, 0.5, 1, 5],
            'gradientboostingclassifier__n_estimators':[8,9,10,11,12,13,14,15],
            'gradientboostingclassifier__min_samples_split':np.arange(2,6),
            'gradientboostingclassifier__min_samples_leaf':np.arange(2,6),
            'gradientboostingclassifier__ccp_alpha':np.arange(0.005, 0.035, 0.005)},
        n_iter=30,
        scoring=metrics,
        n_jobs=-1,
        refit='accuracy', 
        cv=sss,
        random_state=9597)

    _ = gbc_cv.fit(X_train, y_train)
In [49]:
# Skip the search when a tuned model has already been saved.
if 'svc.joblib' not in os.listdir('../working/best_estimators'):

    # Tune the variance threshold, C and kernel for the SVC.
    svc_cv = GridSearchCV(
        estimator =  make_pipeline(preprocessor, svc),
        param_grid={
            'pipeline__variancethreshold__threshold':np.arange(0.05, 0.25,0.025),
            'svc__C':np.arange(1,11,1),
            'svc__kernel':['rbf','sigmoid']},
        scoring=metrics,
        n_jobs=-1,
        refit='accuracy', 
        cv=sss,
        return_train_score=False)

    _ = svc_cv.fit(X_train, y_train)
In [50]:
# Check if the best_estimators folder is empty. 
# NOTE(review): a partially-populated folder skips the save branch even though
# some *_cv objects may exist, and the save branch assumes every search above
# ran in this session — confirm the intended workflow.
if len(os.listdir(os.path.join(dirname, 'working/best_estimators'))) == 0:

    # Create a list with the GridSearchCV best estimators.
    best_estimators = [
        log_cv.best_estimator_,
        l1_cv.best_estimator_, 
        l2_cv.best_estimator_,
        net_cv.best_estimator_,
        sgd_cv.best_estimator_,
        mlp_cv.best_estimator_,
        dtc_cv.best_estimator_,
        rfc_cv.best_estimator_,
        etc_cv.best_estimator_, 
        ets_cv.best_estimator_,
        abc_cv.best_estimator_,
        gpc_cv.best_estimator_,
        gbc_cv.best_estimator_,
        svc_cv.best_estimator_]
    
    # Save the best estimators to folder.
    funs.save_best_estimators(best_estimators)

else:
    # Load the best estimators from folder.
    best_estimators = funs.load_best_estimators()
In [51]:
# Cross-validate the tuned (best) estimators on the training folds.
train, validate = funs.cv_models_performance(best_estimators, Transformer, X_train, y_train, metrics, sss, best=True)
In [52]:
# Plot train vs. validation performance for each tuned estimator.
funs.performance_plotter(train, validate, 'Validation', color)
In [53]:
# Store this procedure's validation results for the final comparison.
validation.append(validate)
In [54]:
# Display the validation metrics without the row index.
validate.style.hide(axis='index')
Out[54]:
Model Accuracy Recall weighted Precision weighted AUC
logisticregression 0.416700 0.354200 0.354200 0.700400
logisticregression_l1 0.444400 0.375000 0.361100 0.685400
logisticregression_l2 0.388900 0.319400 0.267400 0.699700
logisticregression_elasticnet 0.444400 0.375000 0.361100 0.691400
sgd_l1 0.444400 0.340300 0.229700 0.760700
mlp 0.416700 0.395800 0.276400 0.678200
decisiontree 0.444400 0.409700 0.315300 0.671300
randomforest 0.472200 0.395800 0.313900 0.710800
extratree 0.444400 0.409700 0.291700 0.723300
adaboost 0.472200 0.444400 0.308300 0.777200
extratrees 0.555600 0.513900 0.465800 0.744000
gaussianprocess 0.083300 0.125000 0.010400 0.500000
gradientboosting 0.388900 0.312500 0.152200 0.626500
svc 0.444400 0.333300 0.236100 0.609700

3.5 Comparison Between Procedures¶

In [55]:
# Labels for the four training procedures compared below.
names = ['Standard Estimator',
         'Variance T (0.1)',
         'Variance T (0.2)',
         'Best Estimator']
In [56]:
# Compare accuracy across the four procedures.
funs.comparison_plotter('Accuracy', validation, names, color)
In [57]:
# Compare weighted precision across the four procedures.
funs.comparison_plotter('Precision weighted', validation, names, color)
In [58]:
# Compare weighted recall across the four procedures.
funs.comparison_plotter('Recall weighted', validation, names, color)
In [59]:
# Compare AUC across the four procedures.
funs.comparison_plotter('AUC', validation, names, color)

4. Test Set Performance¶

In [60]:
# Fit the tuned estimators on the training set and score them on the held-out test set.
train, test = funs.models_performance_train_test(best_estimators, Transformer, X_train, y_train, X_test, y_test, classes, best=True)
In [61]:
# Plot train vs. test performance for each tuned estimator.
funs.performance_plotter(train, test, 'Test', color)
In [62]:
# Display the test metrics without the row index.
test.style.hide(axis='index')
Out[62]:
Model Accuracy Recall weighted Precision weighted AUC
logisticregression 0.310300 0.310300 0.316300 0.721700
logisticregression_l1 0.275900 0.275900 0.422400 0.734400
logisticregression_l2 0.344800 0.344800 0.405400 0.720700
logisticregression_elasticnet 0.344800 0.344800 0.405400 0.737800
sgd_l1 0.206900 0.206900 0.115000 0.628700
mlp 0.137900 0.137900 0.133600 0.627800
decisiontree 0.206900 0.206900 0.181000 0.486100
randomforest 0.310300 0.310300 0.250000 0.715900
extratree 0.172400 0.172400 0.232200 0.566100
adaboost 0.172400 0.172400 0.137900 0.538100
extratrees 0.172400 0.172400 0.187700 0.538900
gaussianprocess 0.069000 0.069000 0.004800 0.500000
gradientboosting 0.275900 0.275900 0.171300 0.605500
svc 0.206900 0.206900 0.298300 0.558800

5 Best Classifiers for Grades Prediction¶

5.1.a Best Logistic Regression –l2– Classifier¶

In [63]:
# Option 1: Load the model from best_estimators using joblib.
# NOTE: despite the variable name, `L1` holds the l2-penalised logistic
# regression saved by the grid search above.
L1 = joblib.load('../working/best_estimators/logisticregression_l2.joblib')


Overall Test Performance Report

In [64]:
# Print the overall train vs. test performance report for the loaded model.
funs.classification_report(L1, y_train, X_train, y_test, X_test, classes, roc_plot=False)
		 TRAIN 	 TEST

Accuracy: 	 0.929 	 0.345
Recall: 	 0.929 	 0.345
Precision: 	 0.931 	 0.405

AUC: 		 0.995 	 0.721


Test Set Classification Report

In [65]:
# Per-class precision, recall and F1 on the held-out test set.
print(classification_report(y_test, L1.predict(X_test), zero_division=0))
              precision    recall  f1-score   support

          AA       1.00      0.33      0.50         3
          BA       1.00      0.67      0.80         3
          BB       0.67      0.67      0.67         3
          CB       0.00      0.00      0.00         2
          CC       0.27      0.75      0.40         4
          DC       0.33      0.20      0.25         5
          DD       0.14      0.14      0.14         7
        Fail       0.00      0.00      0.00         2

    accuracy                           0.34        29
   macro avg       0.43      0.34      0.34        29
weighted avg       0.41      0.34      0.34        29

In [66]:
# Plot confusion matrices for the train and test sets.
funs.confusion_matrix_plot(L1, (X_train, X_test), (y_train, y_test))
In [67]:
# Plot one-vs-rest ROC curves for the train and test sets (uses the binarizer).
funs.roc_auc_plot(L1, (X_train, X_test), (y_train, y_test), binarizer)
In [68]:
# Create logodds plot by attribute and grade; keep the coefficient table.
logodds = funs.linear_coefficients(L1, 'logisticregression')
In [69]:
# Create probabilities plot by attribute and grade (proba=True).
probabilities = funs.linear_coefficients(L1, 'logisticregression', proba=True)
In [70]:
# Prepare the DataFrame for plotting.
# NOTE(review): `df` appears unused by the following cells — possibly dead code.
df = probabilities.melt(id_vars='Variable', var_name='Grades', value_name='Coefficient')
In [71]:
# Plot the per-grade probabilities for every column except the last.
funs.probabilities_by_grade(probabilities, probabilities.columns[0:-1])

5.1.b Best Random Forest Classifier¶

In [72]:
# Option 1: Load the model from best_estimators using joblib (the original
# comment said pickle, but joblib is what is used).
RFC = joblib.load('../working/best_estimators/randomforest.joblib')
In [73]:
# Refit the loaded random forest pipeline on the training data.
_ = (RFC
     # .set_params(**extra_params)
     .fit(X_train, y_train))


Overall Test Performance Report

In [74]:
# Print the overall train vs. test performance report for the random forest.
funs.classification_report(RFC, y_train, X_train, y_test, X_test, classes, roc_plot=False)
		 TRAIN 	 TEST

Accuracy: 	 1.000 	 0.310
Recall: 	 1.000 	 0.310
Precision: 	 1.000 	 0.250

AUC: 		 1.000 	 0.716


Test Set Classification Report

In [75]:
# Per-class precision, recall and F1 on the held-out test set.
print(classification_report(y_test, RFC.predict(X_test), zero_division=0))
              precision    recall  f1-score   support

          AA       1.00      0.67      0.80         3
          BA       0.00      0.00      0.00         3
          BB       0.00      0.00      0.00         3
          CB       0.00      0.00      0.00         2
          CC       0.17      0.25      0.20         4
          DC       0.25      0.20      0.22         5
          DD       0.33      0.71      0.45         7
        Fail       0.00      0.00      0.00         2

    accuracy                           0.31        29
   macro avg       0.22      0.23      0.21        29
weighted avg       0.25      0.31      0.26        29

In [76]:
# Plot confusion matrices for the train and test sets.
funs.confusion_matrix_plot(RFC, (X_train, X_test), (y_train, y_test))
In [77]:
# Plot one-vs-rest ROC curves for the train and test sets (uses the binarizer).
funs.roc_auc_plot(RFC, (X_train, X_test), (y_train, y_test), binarizer)
In [78]:
# Create features importance plot by attribute; keep the importance table.
features = funs.tree_importance(RFC, 'randomforestclassifier')

CONCLUSION¶

In conclusion, the evaluation of various models reveals their performance on the classification task. The results demonstrate the impact of feature selection and hyperparameter optimization on model performance. The best-performing model, the Logistic Regression with l1 penalization, shows promising results in terms of accuracy, recall, precision, and AUC in comparison to the other classifiers.

Nonetheless, the performance of such a model is still poor – given that the tuning process fits a single model across all grades rather than one per grade. If a dedicated model per grade is developed and fine-tuned, better classification performance can be achieved.